# --- Setup: SENSEX price analysis + news-headline sentiment study ---
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
#plotly imports
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
import matplotlib.pyplot as plt  # NOTE(review): duplicate of the pyplot import above
plt.rcParams["figure.figsize"] = (20,3)  # default inline-figure size
sns.set()  # apply seaborn's default plot styling
# Load the SENSEX OHLCV history; 'Date' is parsed to datetime64 at read time.
sensex_df = pd.read_csv('SENSEX (S&P BSE SENSEX) data.csv',parse_dates=['Date'])
sensex_df.head()
| Date | Open | High | Low | Close | Adj Close | Volume | |
|---|---|---|---|---|---|---|---|
| 0 | 2015-12-07 | 25746.029297 | 25785.529297 | 25477.689453 | 25530.109375 | 25530.109375 | 8700.0 |
| 1 | 2015-12-08 | 25488.419922 | 25542.470703 | 25256.789063 | 25310.330078 | 25310.330078 | 12600.0 |
| 2 | 2015-12-09 | 25299.339844 | 25316.949219 | 25012.220703 | 25036.050781 | 25036.050781 | 11600.0 |
| 3 | 2015-12-10 | 25136.710938 | 25289.580078 | 25034.140625 | 25252.320313 | 25252.320313 | 10900.0 |
| 4 | 2015-12-11 | 25281.769531 | 25316.140625 | 24930.429688 | 25044.429688 | 25044.429688 | 14900.0 |
# Percentage of missing values per column of the SENSEX data.
sensex_df.isnull().sum().mul(100).div(len(sensex_df))
Date 0.000000 Open 0.405186 High 0.405186 Low 0.405186 Close 0.405186 Adj Close 0.405186 Volume 0.405186 dtype: float64
# Back-fill the sparse missing rows (~0.4% per price column) with the next
# valid observation.  DataFrame.fillna(method='bfill') is deprecated since
# pandas 2.1; DataFrame.bfill() is the supported equivalent.
sensex_df.bfill(inplace=True)
# Interactive OHLC time-series chart with a range slider for zooming.
fig = px.line(sensex_df, x='Date', y=['Open','High','Low','Close'],
              title='SENSEX (S&P BSE SENSEX) Time Series')
fig.update_xaxes(rangeslider_visible=True)
fig.show()
# Traded-volume trend over the same period.
fig = px.line(sensex_df, x='Date', y='Volume',
              title='SENSEX (S&P BSE SENSEX) - VOLUME')
fig.update_xaxes(rangeslider_visible=True)
fig.show()
# Load the Indian news-headlines dataset (publish_date, headline_category,
# headline_text).
news_data = pd.read_csv('india-news-headlines.csv')
news_data.head()
| publish_date | headline_category | headline_text | |
|---|---|---|---|
| 0 | 20010101 | sports.wwe | win over cena satisfying but defeating underta... |
| 1 | 20010102 | unknown | Status quo will not be disturbed at Ayodhya; s... |
| 2 | 20010102 | unknown | Fissures in Hurriyat over Pak visit |
| 3 | 20010102 | unknown | America's unwanted heading for India? |
| 4 | 20010102 | unknown | For bigwigs; it is destination Goa |
# Percentage of missing values per column in the India news headlines data.
100 * news_data.isnull().sum() / len(news_data)
publish_date 0.0 headline_category 0.0 headline_text 0.0 dtype: float64
Algorithm for predicting sentiments using an unsupervised method
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os.path
import pickle
from tqdm import tqdm
# One-off downloads of the NLTK resources used below (tokenizer models,
# stop-word lists, WordNet for lemmatization); no-ops when already present.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
import multiprocessing
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
[nltk_data] Downloading package punkt to C:\Users\Shubham [nltk_data] Singh\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to C:\Users\Shubham [nltk_data] Singh\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to C:\Users\Shubham [nltk_data] Singh\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
headlines = list(news_data.headline_text.values)

def preprocess_headlines(headlines):
    """Tokenize, clean, and lemmatize headlines, caching the result on disk.

    Per headline: word-tokenize, lower-case, strip punctuation, keep only
    alphabetic tokens, drop English stop words, and lemmatize as verbs.
    The full result is pickled to 'preprocessed_headlines.pkl' so later
    runs load the cache instead of recomputing.

    Parameters:
        headlines -- iterable of headline strings.
    Returns:
        list of token lists, one per headline.
    """
    if not os.path.isfile('preprocessed_headlines.pkl'):
        # Hoist loop invariants: the original rebuilt the punctuation table,
        # the stop-word set, and the lemmatizer on every iteration, which is
        # millions of redundant allocations over this corpus.
        table = str.maketrans("", "", string.punctuation)
        stop_words = set(stopwords.words('english'))
        wordnet_lemmatizer = WordNetLemmatizer()
        preprocessed_headlines = []
        for headline in tqdm(headlines):
            # make every token into lower case words
            tokens = [token.lower() for token in word_tokenize(headline)]
            # Remove punctuation if any, then keep alphabetic tokens only
            stripped = [w.translate(table) for w in tokens]
            words = [word for word in stripped if word.isalpha()]
            # Remove stop words
            words = [word for word in words if word not in stop_words]
            # Lemmatize as verbs ('v') to normalise inflected forms
            lemmatized_words = [wordnet_lemmatizer.lemmatize(i, 'v') for i in words]
            preprocessed_headlines.append(lemmatized_words)
        print('Headlines Preprocessed! Saving the preprocessed headlines...')
        with open('preprocessed_headlines.pkl', 'wb') as f:
            pickle.dump(preprocessed_headlines, f)
        print('Preprocessed Headline saved for use later')
    else:
        print('Opening saved preprocessed headlines')
        with open('preprocessed_headlines.pkl', 'rb') as f:
            preprocessed_headlines = pickle.load(f)
    return preprocessed_headlines

preprocessed_headlines = preprocess_headlines(headlines)
Opening saved preprocessed headlines
# Join each token list back into a space-separated sentence and persist the
# cleaned corpus for later reuse.
cleaned_headlines = pd.DataFrame(
    {'headlines': [' '.join(tokens) for tokens in preprocessed_headlines]}
)
cleaned_headlines.to_csv('clean_headlines.csv')
# I used gensim’s implementation of word2vec algorithm with CBOW architecture.
# I trained 300 dimensional embeddings with lookup window equal to 4, negative sampling was set to 20 words,
# sub-sampling to 1e-5, and learning rate decayed from 0.03 to 0.0007.
# NOTE(review): 'size' is the gensim 3.x parameter name; gensim >= 4.0
# renamed it to 'vector_size' (consistent with the .vocab usage further
# below, this file targets gensim 3.x) -- confirm the installed version.
w2v_model = Word2Vec(min_count=3,
window=4,
size=300,
sample=1e-5,
alpha=0.03,
min_alpha=0.0007,
negative=20,
workers=multiprocessing.cpu_count()-1)
start = time()
w2v_model.build_vocab(preprocessed_headlines, progress_per=50000)
print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))
Time to build vocab: 0.32 mins
start = time()
# Train the embeddings for 50 epochs over the whole preprocessed corpus.
w2v_model.train(preprocessed_headlines, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))
# L2-normalise the vectors in place to save memory; the model cannot be
# trained further afterwards.  NOTE(review): init_sims is deprecated in
# gensim >= 4 -- confirm the installed version before upgrading.
w2v_model.init_sims(replace=True)
Time to train the model: 12.81 mins
# Persist the trained embeddings and reload just the keyed vectors.
w2v_model.save("word2vec.model")
word_vectors = Word2Vec.load("word2vec.model").wv
# Split the embedding space into two clusters, expected to separate
# positive-leaning from negative-leaning words.  random_state=True was
# coerced to the integer seed 1; the explicit literal keeps behaviour
# identical while being less surprising.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=word_vectors.vectors.astype('double'))
# NOTE(review): KMeans label numbering is arbitrary -- treating cluster 1 as
# "positive" is an assumption; verify against sample words before trusting it.
positive_cluster_index = 1
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[1-positive_cluster_index]
# Vocabulary table: one row per word plus its embedding.  (.vocab is the
# gensim 3.x API; gensim 4 renamed it to key_to_index.)
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda w: word_vectors[w])
# Assign every word its cluster in one vectorized predict() call instead of
# the former one-call-per-row apply.
words['cluster'] = model.predict(np.stack(words.vectors.values).astype('double'))
# Sentiment coefficient of a word = cluster value * closeness score:
#   cluster value   : +1 for the positive cluster, -1 for the negative one
#   closeness score : 1 / distance to the nearest cluster centre, i.e. how
#                     tightly the word belongs to its own cluster
words['cluster_value'] = [1 if i==positive_cluster_index else -1 for i in words.cluster]
# One batched transform() over the full vocabulary replaces the former
# per-row DataFrame.apply, which invoked model.transform once per word.
distances_to_centers = model.transform(np.stack(words.vectors.values).astype('double'))
words['closeness_score'] = 1 / distances_to_centers.min(axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value
words.drop(['cluster'],axis=1,inplace=True)
words.head()
| words | vectors | cluster_value | closeness_score | sentiment_coeff | |
|---|---|---|---|---|---|
| 0 | win | [-0.086885355, 0.041603863, 0.032596402, 0.020... | 1 | 1.039434 | 1.039434 |
| 1 | cena | [0.16243222, -0.016612386, -0.071292974, 0.020... | 1 | 1.088483 | 1.088483 |
| 2 | satisfy | [0.044864807, 0.01616498, 0.010574658, -0.0237... | -1 | 0.989017 | -0.989017 |
| 3 | defeat | [-0.052080262, -0.032441083, 0.032221995, -0.0... | -1 | 0.978982 | -0.978982 |
| 4 | undertaker | [0.080534264, 0.015349642, -0.00408135, -0.020... | 1 | 1.048805 | 1.048805 |
# Saving the words (vocabulary) and their sentiment coefficients
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)
# Tf-idf over the cleaned headlines.  The whitespace tokenizer keeps tokens
# exactly as produced during preprocessing; norm=None keeps the raw tf-idf
# weights with no per-document normalisation.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(cleaned_headlines.headlines)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions require get_feature_names_out() -- confirm installed version.
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(cleaned_headlines.headlines)
D:\Anaconda\lib\site-packages\sklearn\feature_extraction\text.py:484: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tfidf score} dictionary for the sentence in row x.
    inspired by function from this wonderful article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    # COO form exposes the non-zero column indices and their tf-idf values.
    row_coo = transformed_file[x.name].tocoo()
    # Translate column indices into the actual word strings.
    word_names = features.iloc[row_coo.col].values
    return dict(zip(word_names, row_coo.data))
def replace_tfidf_words(x, transformed_file, features):
    '''
    Replace every word of the sentence in row x with its tf-idf score.
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    scores = create_tfidf_dictionary(x, transformed_file, features)
    # Every word in the sentence is in the dictionary by construction, so a
    # direct lookup per token is safe here.
    return [scores[word] for word in x.headlines.split()]
%%time
# Score every headline: one list of per-word tf-idf weights per sentence.
replaced_tfidf_scores = cleaned_headlines.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 10 minutes to calculate
Wall time: 10min 36s
def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment coefficient for word; unknown words score 0.
    '''
    # dict.get with a default replaces the original try/except KeyError.
    return sentiment_dict.get(word, 0)
# Reload the word -> sentiment_coeff mapping saved earlier.
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))
# Per headline: list of sentiment coefficients, one per word (0 if unknown).
replaced_closeness_scores = cleaned_headlines.headlines.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))
# Side-by-side frame: sentiment vector, tf-idf vector, original sentence.
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, cleaned_headlines.headlines]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence']
# Each sentence now carries two parallel vectors: per-word sentiment
# coefficients and per-word tf-idf weights.  Their dot product is the
# sentence-level sentiment rate:
#   rate > 0  => sentiment POSITIVE
#   rate <= 0 => sentiment NEGATIVE
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.dot(row['sentiment_coeff'], row['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate > 0).astype('int8')
# 'sentiment' mirrors 'prediction' as a plain 0/1 integer column.
replacement_df['sentiment'] = replacement_df.prediction.astype('int64')
replacement_df.head()
| sentiment_coeff | tfidf_scores | sentence | sentiment_rate | prediction | sentiment | |
|---|---|---|---|---|---|---|
| 0 | [1.0394342977025397, 1.088482632230146, -0.989... | [5.945566497068443, 13.175362648565239, 9.7999... | win cena satisfy defeat undertaker bigger roma... | 10.522846 | 1 | 1 |
| 1 | [-1.0111097989227964, -1.0176282771836704, -0.... | [7.560947264591127, 9.908257040601391, 9.64038... | status quo disturb ayodhya say vajpayee | -48.977423 | 0 | 0 |
| 2 | [-1.0085150605144462, -0.9947737321880664, -0.... | [11.326444765497236, 9.42102597779666, 6.32193... | fissure hurriyat pak visit | -33.438972 | 0 | 0 |
| 3 | [1.0033044744156854, -0.990119659324128, -1.00... | [8.94296262902374, 10.79364023501247, 6.851952... | america unwanted head india | -13.273439 | 0 | 0 |
| 4 | [-0.9919454123565428, -0.9868242601420738, -1.... | [10.262372802081302, 9.183115956366148, 6.4121... | bigwigs destination goa | -25.801407 | 0 | 0 |
# Attach each headline's predicted sentiment back onto the news dataframe.
news_data['Sentiment Prediction'] = replacement_df.sentiment
mapper = {0:'Negative',1:'Positive'}
news_data['Overall Predicted Sentiment'] = news_data['Sentiment Prediction'].map(mapper)
# Below are some examples of the result, which look pretty satisfactory
news_data.iloc[[2480072,276604,2700284,2498502,2449222,2717085,2177184,3191296,526745,2015259,1044300,1918820],:]
| publish_date | headline_category | headline_text | Sentiment Prediction | Overall Predicted Sentiment | |
|---|---|---|---|---|---|
| 2480072 | 20161112 | city.chennai | Stalin says Chennai facing water crisis | 0 | Negative |
| 276604 | 20040216 | business.india-business | Tata dreams to be a global brand | 1 | Positive |
| 2700284 | 20170923 | entertainment.hindi.bollywood | Anupam Kher: Shekhar Kapur is a true friend | 1 | Positive |
| 2498502 | 20161208 | city.goa | Proteges from A R Rahman Foundation to perform... | 1 | Positive |
| 2449222 | 20160928 | entertainment.english.music.news | Robbie Williams: I'm incredibly proud of my wife | 1 | Positive |
| 2717085 | 20171017 | city.hyderabad | Two killed in Warangal hospital fire | 0 | Negative |
| 2177184 | 20150905 | entertainment.english.music.news | Demi Lovato to dedicate single to her father | 1 | Positive |
| 3191296 | 20191202 | entertainment.hindi.bollywood | Anshuman Jha: It's satisfying that our film wa... | 1 | Positive |
| 526745 | 20071224 | city.pune | Goon murdered in Pune | 0 | Negative |
| 2015259 | 20150116 | sports.icc-world-t20-2016 | MS Dhoni exudes guarded optimism on World Cup ... | 1 | Positive |
| 1044300 | 20110201 | unknown | Ishant wishes best of luck to Team India | 1 | Positive |
| 1918820 | 20140828 | unknown | First ODI: Raina; spinners deliver emphatic win | 1 | Positive |
Algorithms for Time Series Forecasting
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.callbacks import EarlyStopping
# tensorflow_docs supplies the compact EpochDots training-progress callback.
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
close_data = sensex_df['Close'].values
dates = sensex_df['Date'].values
from datetime import datetime
split_date = datetime(2019,12,31)
# Training data: rows on or before 31-Dec-2019.
# (the original comment said 31-Dec-2017, contradicting split_date above)
train_df = sensex_df.loc[sensex_df['Date'] <= split_date]
train_close = train_df['Close'].values
train_dates = train_df['Date'].values
# Testing data: rows strictly after 31-Dec-2019.
test_df = sensex_df.loc[sensex_df['Date'] > split_date]
test_close = test_df['Close'].values
test_dates = test_df['Date'].values
# Reshape to a single-feature column as required by MinMaxScaler
# (the original comment said "Standard Scaler"; the code uses MinMaxScaler)
train_close_values = train_close.reshape(-1,1)
scaler = MinMaxScaler(feature_range=(0, 1))
train_close_scaled = scaler.fit_transform(train_close_values)
# Sliding window: 30 past closing prices as input, the 31st as the target
X_train = []
y_train = []
for i in range(30,len(train_close_scaled)):
    X_train.append(train_close_scaled[i-30:i,0])
    y_train.append(train_close_scaled[i,0])
X_train,y_train = np.array(X_train),np.array(y_train)
# Reshaping to (samples, timesteps, features=1) for the LSTM
X_train = np.reshape(X_train,(X_train.shape[0],X_train.shape[1],1))
def createLSTM_model(X_train, units=50, dropout_rate=0.2, num_lstm_layers=4):
    """Build and compile a stacked-LSTM regressor for closing-price windows.

    Parameters:
        X_train         -- training array of shape (samples, timesteps, 1);
                           only its timestep count is read for input_shape.
        units           -- LSTM units per layer (default 50, as before).
        dropout_rate    -- dropout applied after every LSTM layer (default 0.2).
        num_lstm_layers -- number of stacked LSTM layers (default 4).
    Returns:
        a compiled keras Sequential model (adam optimizer, MSE loss).
    """
    regressor = Sequential()
    # First layer declares the input shape; every layer except the last must
    # return full sequences so the next LSTM receives a 3-D tensor.
    regressor.add(LSTM(units=units, return_sequences=True,
                       input_shape=(X_train.shape[1], 1)))
    regressor.add(Dropout(dropout_rate))
    for layer_idx in range(1, num_lstm_layers):
        is_last = layer_idx == num_lstm_layers - 1
        regressor.add(LSTM(units=units, return_sequences=not is_last))
        regressor.add(Dropout(dropout_rate))
    # Single-unit output layer: the predicted (scaled) closing price.
    regressor.add(Dense(units=1))
    regressor.compile(optimizer='adam',loss='mean_squared_error')
    print(regressor.summary())
    return regressor

model = createLSTM_model(X_train)
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm (LSTM) (None, 30, 50) 10400 _________________________________________________________________ dropout (Dropout) (None, 30, 50) 0 _________________________________________________________________ lstm_1 (LSTM) (None, 30, 50) 20200 _________________________________________________________________ dropout_1 (Dropout) (None, 30, 50) 0 _________________________________________________________________ lstm_2 (LSTM) (None, 30, 50) 20200 _________________________________________________________________ dropout_2 (Dropout) (None, 30, 50) 0 _________________________________________________________________ lstm_3 (LSTM) (None, 50) 20200 _________________________________________________________________ dropout_3 (Dropout) (None, 50) 0 _________________________________________________________________ dense (Dense) (None, 1) 51 ================================================================= Total params: 71,051 Trainable params: 71,051 Non-trainable params: 0 _________________________________________________________________ None
# Stop training once validation loss fails to improve for 3 epochs.
early_stopping = EarlyStopping(monitor='val_loss',patience=3,mode='min')
# Hold out the final 20% of the windows for validation; EpochDots prints
# compact per-epoch progress.
model.fit(X_train,y_train,epochs=100,batch_size=32,callbacks=[tfdocs.modeling.EpochDots(),early_stopping],validation_split=0.2)
Epoch 1/100 25/25 [==============================] - ETA: 0s - loss: 0.0409 Epoch: 0, loss:0.0409, val_loss:0.0082, 25/25 [==============================] - 2s 60ms/step - loss: 0.0409 - val_loss: 0.0082 Epoch 2/100 25/25 [==============================] - 0s 10ms/step - loss: 0.0076 - val_loss: 0.0070 Epoch 3/100 25/25 [==============================] - 0s 10ms/step - loss: 0.0054 - val_loss: 0.0137 Epoch 4/100 25/25 [==============================] - 0s 10ms/step - loss: 0.0042 - val_loss: 0.0078 Epoch 5/100 25/25 [==============================] - 0s 10ms/step - loss: 0.0041 - val_loss: 0.0148
<tensorflow.python.keras.callbacks.History at 0x1e07b3973a0>
model.save("StockPricePrediction.h5")
# NOTE: 'close_test_values' holds the test data PLUS the 30 closing prices
# immediately before the split, because predicting the first test value needs
# a full 30-day lookback window.  'test_close' holds only the post-split data.
close_test_values = close_data[len(close_data) - len(test_close)-30:] # Contains the test data and 30 values from training data
# Reshape to a single-feature column for the already-fitted MinMaxScaler
# (the original comment said "Standand Scaler"; the code uses MinMaxScaler)
close_test_values = close_test_values.reshape(-1,1)
inputs_scaled = scaler.transform(close_test_values)
# Build the 30-step sliding windows for inference.
X_test = []
for i in range(30,inputs_scaled.shape[0]):
    X_test.append(inputs_scaled[i-30:i,0])
X_test = np.array(X_test)
# Reshape to (samples, timesteps, features=1) as expected by the LSTM
X_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1],1))
predicted_stock_price = model.predict(X_test)
# Map scaled predictions back to actual closing-price units.
predicted_stock_price = scaler.inverse_transform(predicted_stock_price)
# Actual vs predicted closing prices over the test period.
plt.plot(test_close,color='red',label='Real Stock Price')
plt.plot(predicted_stock_price,color='green',label='Predicted Stock Price')
plt.xlabel("Time")
plt.ylabel("Close")
plt.legend()
<matplotlib.legend.Legend at 0x1e2403d5520>
# Daily count of headlines per predicted sentiment class.
sentiment_analysis_df = news_data.groupby(['publish_date','Overall Predicted Sentiment']).agg({'Overall Predicted Sentiment':'count'})
sentiment_analysis_df.columns = ['Count of Sentiments']
sentiment_analysis_df = sentiment_analysis_df.reset_index()
# publish_date is an integer like 20010102; parse it with one vectorized
# to_datetime call instead of the former three chained .apply() string
# manipulations (same datetime64 result, far fewer Python-level calls).
sentiment_analysis_df['publish_date'] = pd.to_datetime(sentiment_analysis_df['publish_date'].astype(str), format='%Y%m%d')
sentiment_analysis_df.head()
| publish_date | Overall Predicted Sentiment | Count of Sentiments | |
|---|---|---|---|
| 0 | 2001-01-01 | Positive | 1 |
| 1 | 2001-01-02 | Negative | 75 |
| 2 | 2001-01-02 | Positive | 11 |
| 3 | 2001-01-03 | Negative | 33 |
| 4 | 2001-01-03 | Positive | 8 |
def _plot_sentiment_trend(df, title):
    """Render an interactive daily-count line chart with a range slider."""
    fig = px.line(df, x='publish_date', y='Count of Sentiments', title=title)
    fig.update_xaxes(rangeslider_visible=True)
    fig.show()

# The positive and negative trend plots were copy-pasted duplicates; the
# helper above removes the duplication while producing identical figures.
positive_sentiments_df = sentiment_analysis_df[sentiment_analysis_df['Overall Predicted Sentiment'] == 'Positive']
_plot_sentiment_trend(positive_sentiments_df, 'Positive Sentiment Articles Trend')
negative_sentiments_df = sentiment_analysis_df[sentiment_analysis_df['Overall Predicted Sentiment'] == 'Negative']
_plot_sentiment_trend(negative_sentiments_df, 'Negative Sentiment Articles Trend')
# Full history: training series, true test series, and model predictions.
plt.figure(figsize=(20,8))
plt.plot(train_dates,train_close,color='blue',label='Real Training Stock Price')
plt.plot(test_dates,test_close,color='red',label='Real Testing Stock Price')
# predicted_stock_price is shape (n, 1); matplotlib plots its single column.
plt.plot(test_dates,predicted_stock_price,color='green',label='Predict Stock Price')
plt.xlabel("Time")
plt.ylabel("Stock Price")
plt.legend(loc=(1,1))
<matplotlib.legend.Legend at 0x1e0a9a90e50>